roseplot of the OLINK data

The goal of this notebook is to generate roseplots of the cytokine panels measured from plasma and CSF samples.

The roseplots for the marginalized data appear in Figure 3 A.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

1. reading the data

In [2]:
## first, grabbing molecule names from the original files (this is a bit dirty...)
df =pd.read_csv('data/Inflammation_Plasma_Etter.csv',sep=';', skiprows=[0,1,2,4,5,6] , index_col=0)
df.drop(columns=['Plate ID' , 'QC Warning'] , inplace=True)

inflammatoryMolecules = list( df.columns )

df =pd.read_csv('data/Neurology_Plasma_Etter.csv',sep=';', skiprows=[0,1,2,4,5,6] , index_col=0)
df.drop(columns=['Plate ID' , 'QC Warning'] , inplace=True)

neuroMolecules = list( df.columns )

## removing common molecules. we keep the ones in neuro
inter = set( inflammatoryMolecules ).intersection(neuroMolecules)
for c in inter:
    inflammatoryMolecules.remove(c)



print(len(inflammatoryMolecules))
print(len(neuroMolecules))

## now reading the molecular data
df_molecular = pd.read_csv( "imputed_data/ALL_imputed.csv" , index_col='Assay' )
df_molecular.head()

## and metadata
df_metadata = pd.read_csv('data/metadata.csv', index_col='studyID')
df_metadata.covid = df_metadata.covid.astype(bool)
df_metadata.Age = df_metadata.Age.astype(int)
df_metadata.head()
90
92
Out[2]:
Age Sex Group Stage covid
studyID
4927 38 f healthy control I False
5050 53 m healthy control I False
5052 22 m healthy control I False
5062 23 m healthy control I False
5092 47 f healthy control I False

back to ToC

2. preliminary tests

In [3]:
df_molecular_tmp = df_molecular.copy()
df_molecular_tmp["patient"] = df_molecular_tmp.index
df_molecular_long = df_molecular_tmp.melt( id_vars=['patient'])# ,ignore_index=False)
In [4]:
df_molecular_long.patient.value_counts()
Out[4]:
6346     364
5003     364
6440     364
5050     364
NC004    364
        ... 
6514     364
5814     364
NC028    364
5272     364
NC032    364
Name: patient, Length: 90, dtype: int64
In [5]:
df_molecular_long.head()
Out[5]:
patient variable value
0 6346 CSF_NMNAT1 0.693481
1 4995 CSF_NMNAT1 2.343750
2 5204 CSF_NMNAT1 2.194830
3 5652 CSF_NMNAT1 4.396180
4 NC029 CSF_NMNAT1 2.167530
In [6]:
df_molecular_long['Group'] = df_molecular_long.patient.replace( df_metadata.Group )

Let's see what we get when we just plot all this along a single axis

In [8]:
fig,ax = plt.subplots(figsize=(14,8))

sns.lineplot(x='variable' , y='value' , data= df_molecular_long , hue='Group' , ax=ax)
Out[8]:
<AxesSubplot:xlabel='variable', ylabel='value'>

OK, let's reduce this to plot a single panel

In [7]:
PImolec = list( map( lambda x : "Plasma_"+x , inflammatoryMolecules ) )
PNmolec = list( map( lambda x : "Plasma_"+x , neuroMolecules ) )
CImolec = list( map( lambda x : "CSF_"+x , inflammatoryMolecules ) )
CNmolec = list( map( lambda x : "CSF_"+x , neuroMolecules ) )
In [8]:
m = df_molecular_long.variable.isin( PImolec )
In [11]:
fig,ax = plt.subplots(figsize=(14,8))

sns.lineplot(x='variable' , y='value' , data= df_molecular_long.loc[m,:] , hue='Group' , ax=ax)
Out[11]:
<AxesSubplot:xlabel='variable', ylabel='value'>
In [9]:
def getZscore( c ):
    return ( c - c.mean() ) / c.std(ddof=0)

df_molecular_Z = df_molecular.apply(getZscore)
print("means")
print( df_molecular_Z.mean().describe() )
print("\nstds")
print( df_molecular_Z.std(ddof=0).describe() )
means
count    3.580000e+02
mean     5.980901e-17
std      1.761777e-15
min     -2.375137e-14
25%     -1.802480e-16
50%      4.179663e-17
75%      4.423205e-16
max      1.115709e-14
dtype: float64

stds
count    3.580000e+02
mean     1.000000e+00
std      1.077075e-16
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      1.000000e+00
dtype: float64
In [10]:
df_molecular_tmp = df_molecular_Z.copy()
df_molecular_tmp["patient"] = df_molecular_tmp.index
df_molecular_Z_long = df_molecular_tmp.melt( id_vars=['patient'])# ,ignore_index=False)
In [11]:
df_molecular_Z_long['Group'] = df_molecular_Z_long.patient.replace( df_metadata.Group )
In [12]:
df_molecular_Z_long.head()
Out[12]:
patient variable value Group
0 6346 CSF_NMNAT1 -0.969989 healthy control
1 4995 CSF_NMNAT1 -0.012901 inflammatory control
2 5204 CSF_NMNAT1 -0.099268 healthy control
3 5652 CSF_NMNAT1 1.177424 inflammatory control
4 NC029 CSF_NMNAT1 -0.115101 Non-ICU COVID
In [13]:
m = df_molecular_Z_long.variable.isin( PImolec )
In [17]:
fig,ax = plt.subplots(figsize=(14,8))

sns.lineplot(x='variable' , y='value' , data= df_molecular_Z_long.loc[m,:] , hue='Group' , ax=ax)
Out[17]:
<AxesSubplot:xlabel='variable', ylabel='value'>

This is quite better!

Potential improvement :

  • find a clever way to order columns. (most discriminative ? increasing ICU COVID mean?)

But first, we want to switch to a polar projection:

In [17]:
def makeRosePlot( df , groups, orderedMol , colorD={'ICU COVID' : '#a6611a', 'Non-ICU COVID' : '#dfc27d', 'healthy control' : '#80cdc1','inflammatory control' : '#018571' }, linewidth= 1.0 , alpha = 0.2 ):
    
    angles = np.linspace( 0, 2*np.pi * (1-1/len( orderedMol ))  , len( orderedMol ) )


    fig = plt.figure(figsize = (14,14))
    ax = fig.add_subplot(projection='polar')

    for g in groups:

        #print(g)
        M = df.loc[ g , orderedMol , : ].value

        S = np.array(df.loc[ g , orderedMol , : ]["std"])
        C = np.array(df.loc[ g , orderedMol , : ]["count"])
        SEM = S / np.sqrt(C)

        #print(len(M),len(S),len(C))
        c = ax.plot(angles, M, c= colorD[g] , label=g , linewidth =linewidth )
        ax.fill_between( angles , M+SEM , M-SEM , color =colorD[g],  alpha=alpha )

    ## handling labels    
    ax.set_xticks(angles)

    Xlabels = []
    for a,l in  zip( angles , orderedMol ) :

        m = l.partition('_')[2]
        if a <= np.pi*0.5 or a > np.pi*1.5:
            m = m.ljust(20)
        else:
            m = m.rjust(20)

        Xlabels.append(m)


    ax.set_xticklabels( Xlabels ,
                      fontfamily='monospace')#, rotation='vertical')

    angleLabels = np.linspace(0,2*np.pi,len(ax.get_xticklabels())+1)
    angleLabels[np.cos(angleLabels) < 0] = angleLabels[np.cos(angleLabels) < 0] + np.pi
    angleLabels = np.rad2deg(angleLabels)
    labels=[]
    for label, angle in zip(ax.get_xticklabels(), angleLabels):
        x,y = label.get_position()

        lblTransform = label.get_text()

        lab = ax.text(x,y-0.13, lblTransform, transform=label.get_transform(),
                      ha=label.get_ha(),va=label.get_va(), fontfamily='monospace')
        lab.set_rotation(angle)
        labels.append(lab)
    ax.set_xticklabels([])
    ax.legend(loc='upper right',bbox_to_anchor=(1.05, 1.05))

    return fig,ax
In [239]:
def makeFlatRosePlot( df , groups, orderedMol , colorD={'ICU COVID' : '#a6611a', 'Non-ICU COVID' : '#dfc27d', 'healthy control' : '#80cdc1','inflammatory control' : '#018571' } ):
    
    angles = np.linspace( 0, 2*np.pi * (1-1/len( orderedMol ))  , len( orderedMol ) )


    fig = plt.figure(figsize = (14,14))
    ax = fig.add_subplot()

    for g in groups:

        #print(g)
        M = df.loc[ g , orderedMol , : ].value

        S = np.array(df.loc[ g , orderedMol , : ]["std"])
        C = np.array(df.loc[ g , orderedMol , : ]["count"])
        SEM = S / np.sqrt(C)

        #print(len(M),len(S),len(C))
        c = ax.plot(angles, M, c= colorD[g] , label=g )
        ax.fill_between( angles , M+SEM , M-SEM , color =colorD[g],  alpha=0.2 )

    ## handling labels    
    ax.set_xticks(angles)

    Xlabels = []
    for a,l in  zip( angles , orderedMol ) :

        m = l.partition('_')[2]
        #if a <= np.pi*0.5 or a > np.pi*1.5:
        #    m = m.ljust(20)
        #else:
        #    m = m.rjust(20)

        Xlabels.append(m)


    ax.set_xticklabels( Xlabels ,
                      fontfamily='monospace', rotation='vertical')
    ax.legend(loc='upper right',bbox_to_anchor=(1.05, 1.05))

    return fig,ax

back to ToC

3. roseplot of imputed data

In [25]:
df_metadata["stageGroup"] = df_metadata.Group
df_metadata.loc[df_metadata.covid==True , "stageGroup"] = df_metadata.Stage[df_metadata.covid == True] 

df_metadata.head()
Out[25]:
Age Sex Group Stage covid stageGroup
studyID
4927 38 f healthy control I False healthy control
5050 53 m healthy control I False healthy control
5052 22 m healthy control I False healthy control
5062 23 m healthy control I False healthy control
5092 47 f healthy control I False healthy control
In [26]:
pd.crosstab( df_metadata.Group , df_metadata.stageGroup )
Out[26]:
stageGroup I II III healthy control inflammatory control
Group
ICU COVID 0 0 11 0 0
Non-ICU COVID 18 7 4 0 0
healthy control 0 0 0 25 0
inflammatory control 0 0 0 0 25
In [27]:
df_molecular_tmp = df_molecular_Z.copy()
df_molecular_tmp["patient"] = df_molecular_tmp.index
df_molecular_Z_long = df_molecular_tmp.melt( id_vars=['patient'])# ,ignore_index=False)
In [28]:
df_molecular_Z_long['stageGroup'] = df_molecular_Z_long.patient.replace( df_metadata.stageGroup )
In [29]:
df_molecular_Z_long.head()
Out[29]:
patient variable value stageGroup
0 6346 CSF_NMNAT1 -0.969989 healthy control
1 4995 CSF_NMNAT1 -0.012901 inflammatory control
2 5204 CSF_NMNAT1 -0.099268 healthy control
3 5652 CSF_NMNAT1 1.177424 inflammatory control
4 NC029 CSF_NMNAT1 -0.115101 I
In [30]:
m = df_molecular_Z_long.variable.isin( PImolec )
In [50]:
fig,ax = plt.subplots(figsize=(14,8))

sns.lineplot(x='variable' , y='value' , data= df_molecular_Z_long.loc[m,:] , hue='stageGroup' , ax=ax)
Out[50]:
<AxesSubplot:xlabel='variable', ylabel='value'>

This is quite better!

Potential improvement : find a clever way to order columns. (most discriminative ? increasing ICU COVID mean?)

In [31]:
x = "variable"
y= "value"
hue='stageGroup'
GB = df_molecular_Z_long.groupby( by=[hue,x] )

test = pd.DataFrame( GB[y].mean() )
test['std'] = GB[y].std()
test['count'] = GB[y].count()
test
Out[31]:
value std count
stageGroup variable
I CSF_4E-BP1 -0.349481 0.756210 16
CSF_ADA -0.290201 0.663098 16
CSF_ADAM 22 0.020418 1.035972 16
CSF_ADAM 23 -0.271007 1.168089 16
CSF_ARTN -0.034452 0.959587 16
... ... ... ... ...
inflammatory control Plasma_VWC2 0.190317 0.949536 25
Plasma_WFIKKN1 0.197604 0.853960 25
Plasma_gal-8 -0.343419 0.504406 25
Plasma_sFRP-3 -0.269308 1.070083 25
Plasma_uPA 0.079342 0.976637 25

1820 rows × 3 columns

In [36]:
groups = df_molecular_Z_long[hue].unique()
## fix the order
groups = ['healthy control','inflammatory control','I','II','III']
In [33]:
ListMolec = PImolec


refG = 'III'
M = list( test.loc[ refG , ListMolec , : ].value )
O = np.argsort(M)

orderedMol = [ PImolec[o] for o in O]
orderedMol += list( set( PImolec ).difference(orderedMol) )
In [37]:
# purple / green
#lut = {'III' :'#762a83',
#    'II' : '#af8dc3',
#       'I' : '#e7d4e8',
#    'healthy control' : '#a6dba0',
#    'inflammatory control' : '#008837' }


# red/yellow / turquoise / blues
lut = {'III' :'#073B4C',
    'II' : '#118AB2',
       'I' : '#06D6A0',
    'healthy control' : '#FFCE5C',
    'inflammatory control' : '#F26989' }




ListMolec = PNmolec


refG = 'III'
M = list( test.loc[ refG , ListMolec , : ].value )
O = np.argsort(M)

orderedMol = [ ListMolec[o] for o in O]
orderedMol += list( set( ListMolec ).difference(orderedMol) )

fig,ax = makeRosePlot( df=test , groups=groups, orderedMol=orderedMol , colorD=lut )
ax.set_title("Plasma Neurology panel\nZ-scores - SEM error bars", loc = 'left')
#fig.savefig("images/rose_Plasma_Neurology_neuroCOVID.colors3.png")
Out[37]:
Text(0.0, 1.0, 'Plasma Neurology panel\nZ-scores - SEM error bars')
In [38]:
# red/yellow / turquoise / blues
lut = {'III' :'#073B4C',
    'II' : '#118AB2',
       'I' : '#06D6A0',
    'healthy control' : '#FFCE5C',
    'inflammatory control' : '#F26989' }


ListMolecs = [PImolec,PNmolec,CImolec,CNmolec]
sources = ['Plasma','Plasma','CSF','CSF']
panels = ['Inflammatory','Neurology','Inflammatory','Neurology']

for i in range(len(ListMolecs)):

    ListMolec = ListMolecs[i]

    source = sources[i]
    panel = panels[i]

    refG = 'III'
    M = list( test.loc[ refG , ListMolec , : ].value )
    O = np.argsort(M)

    orderedMol = [ ListMolec[o] for o in O]
    orderedMol += list( set( ListMolec ).difference(orderedMol) )

    fig,ax = makeRosePlot( df=test , groups=groups, orderedMol=orderedMol , colorD=lut )
    ax.set_title(source +" "+panel+" panel\nZ-scores - SEM error bars", loc = 'left')
    fig.savefig("images/rosePlots/rose_"+source+"_"+panel+"_neuroCOVID"+".pdf")

back to ToC

4. roseplots of marginalized data

In [39]:
## now reading the molecular data
df_molecular = pd.read_csv( "marginalized_data/data_marginalized_neuroCOVID.csv" , index_col=0 )
df_molecular.drop(columns="stageGroup" , inplace=True)
df_molecular.head()
Out[39]:
CSF_NMNAT1 CSF_NRP2 CSF_MAPT CSF_CADM3 CSF_GDNF CSF_UNC5C CSF_VWC2 CSF_Siglec_9 CSF_CLM_6 CSF_EZR ... Plasma_TNFRSF9 Plasma_NT_3 Plasma_TWEAK Plasma_CCL20 Plasma_ST1A1 Plasma_STAMBP Plasma_IL5 Plasma_ADA Plasma_TNFB Plasma_CSF_1
4927 0.783146 7.484550 4.359968 8.166541 0.108638 4.733553 6.869538 4.009678 4.725901 3.476555 ... 5.795753 1.409262 7.279297 5.418392 0.236511 3.412914 -0.174867 3.895639 3.732914 9.023864
5050 0.455615 7.286502 4.645233 8.172515 0.067673 5.256008 7.272363 4.442834 4.684212 2.823498 ... 5.633397 2.027130 7.871435 5.936471 0.262972 3.278133 0.099333 4.329668 3.477081 8.744154
5052 0.594596 6.635136 3.490066 8.149356 0.213948 4.224093 6.016119 3.294286 4.056793 2.549147 ... 6.212265 1.921543 8.165437 5.642695 0.347999 3.302632 -0.193420 4.360193 4.213483 9.061708
5062 0.590112 6.957021 3.770465 8.076262 0.209230 4.521448 6.355951 4.349481 3.955589 3.046600 ... 5.892231 1.807028 7.978211 7.307649 0.345256 3.686129 -0.183977 4.093367 3.674123 8.769930
5092 0.742797 6.781485 2.867935 8.159029 0.066171 3.937601 5.994287 3.071284 3.886460 2.290494 ... 5.617805 0.251490 7.912914 6.293002 2.075751 3.948645 -0.089874 3.962206 3.573228 8.686050

5 rows × 364 columns

In [40]:
def getZscore( c ):
    return ( c - c.mean() ) / c.std(ddof=0)

df_molecular_Z = df_molecular.apply(getZscore)
print("means")
print( df_molecular_Z.mean().describe() )
print("\nstds")
print( df_molecular_Z.std(ddof=0).describe() )
means
count    3.580000e+02
mean    -1.800935e-16
std      1.791420e-15
min     -2.183743e-14
25%     -4.144833e-16
50%     -1.600027e-17
75%      1.530638e-16
max      1.221245e-14
dtype: float64

stds
count    3.580000e+02
mean     1.000000e+00
std      1.034563e-16
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      1.000000e+00
dtype: float64
In [41]:
df_molecular_tmp = df_molecular_Z.copy()
df_molecular_tmp["patient"] = df_molecular_tmp.index
df_molecular_Z_long = df_molecular_tmp.melt( id_vars=['patient'])# ,ignore_index=False)
In [42]:
df_molecular_Z_long['stageGroup'] = df_molecular_Z_long.patient.replace( df_metadata.stageGroup )
In [43]:
df_molecular_Z_long.head()
Out[43]:
patient variable value stageGroup
0 4927 CSF_NMNAT1 -0.858822 healthy control
1 5050 CSF_NMNAT1 -1.050542 healthy control
2 5052 CSF_NMNAT1 -0.969190 healthy control
3 5062 CSF_NMNAT1 -0.971814 healthy control
4 5092 CSF_NMNAT1 -0.882441 healthy control
In [44]:
m = df_molecular_Z_long.variable.isin( PImolec )
In [45]:
x = "variable"
y= "value"
hue='stageGroup'
GB = df_molecular_Z_long.groupby( by=[hue,x] )

test = pd.DataFrame( GB[y].mean() )
test['std'] = GB[y].std()
test['count'] = GB[y].count()
test
Out[45]:
value std count
stageGroup variable
I CSF_4E_BP1 -0.339211 0.747433 16
CSF_ADA -0.302486 0.634828 16
CSF_ADAM_22 0.012893 1.025575 16
CSF_ADAM_23 -0.252458 1.155805 16
CSF_ARTN -0.000360 0.941429 16
... ... ... ... ...
inflammatory control Plasma_VWC2 0.190078 0.785582 25
Plasma_WFIKKN1 0.242669 0.747947 25
Plasma_gal_8 -0.353377 0.576010 25
Plasma_sFRP_3 -0.277916 1.091565 25
Plasma_uPA 0.073625 0.980476 25

1820 rows × 3 columns

In [46]:
groups = df_molecular_Z_long[hue].unique()
## fix the order
groups = ['healthy control','inflammatory control','I','II','III']
In [47]:
ListMolec = PImolec


refG = 'III'
M = list( test.loc[ refG , ListMolec , : ].value )
O = np.argsort(M)

orderedMol = [ PImolec[o] for o in O]
orderedMol += list( set( PImolec ).difference(orderedMol) )
In [49]:
# red/yellow / turquoise / blues
lut = {'III' :'#073B4C',
    'II' : '#118AB2',
       'I' : '#06D6A0',
    'healthy control' : '#FFCE5C',
    'inflammatory control' : '#F26989' }

# we replace special characters that were removed duting marginalization
PImolec = ["Plasma_" + x.replace('-','_').replace(' ','_') for x in inflammatoryMolecules]
CImolec = ["CSF_" + x.replace('-','_').replace(' ','_') for x in inflammatoryMolecules]
CNmolec = ["CSF_" + x.replace('-','_').replace(' ','_') for x in neuroMolecules]
PNmolec = ["Plasma_" + x.replace('-','_').replace(' ','_') for x in neuroMolecules]

ListMolecs = [PImolec,PNmolec,CImolec,CNmolec]
sources = ['Plasma','Plasma','CSF','CSF']
panels = ['Inflammatory','Neurology','Inflammatory','Neurology']

for i in range(len(ListMolecs)):

    ListMolec = ListMolecs[i]

    source = sources[i]
    panel = panels[i]

    refG = 'III'
    M = list( test.loc[ refG , ListMolec , : ].value )
    O = np.argsort(M)

    orderedMol = [ ListMolec[o] for o in O]
    orderedMol += list( set( ListMolec ).difference(orderedMol) )

    fig,ax = makeRosePlot( df=test , groups=groups, orderedMol=orderedMol , colorD=lut )
    ax.set_title(source +" "+panel+" panel\nZ-scores - SEM error bars", loc = 'left')
    fig.savefig("images/rosePlots_marginal/rose_marginal_"+source+"_"+panel+"_neuroCOVID"+".pdf")
In [ ]: